library(aplore3)
library(ggplot2)
library(dplyr)
library(plotly)
library(ca)
library(RColorBrewer)
library(pheatmap)
library(cluster)
library(ggcorrplot)
library(dplyr)
library(tidyr)
library(sjPlot)
library(sjmisc)
library(caret)
library(vcd)
library(FactoMineR)
#??glow500
Data Format: A data.frame with 500 rows and 18 variables such as:
priorfrac - If the patient previously had a fracture
age - Age at
Enrollment (years)
weight - Weight at enrollment (Kilograms)
height - Height at enrollment (Centimeters)
bmi - Body Mass Index
(Kg/m^2)
premeno - Menopause before age 45 (1: No, 2: Yes)
momfrac - Mother had hip fracture (1: No, 2: Yes)
armassist - Arms
are needed to stand from a chair (1: No, 2: Yes)
smoke - Former or
current smoker (1: No, 2: Yes)
raterisk - Self-reported risk of
fracture (1: Less than others of the same age, 2: Same as others of the
same age, 3: Greater than others of the same age
fracscore -
Fracture Risk Score (Composite Risk Score)
fracture - Any fracture
in first year (1: No, 2: Yes)
bonemed - Bone medications at
enrollment (1: No, 2: Yes)
bonemed_fu - Bone medications at
follow-up (1: No, 2: Yes)
bonetreat - Bone medications both at
enrollment and follow-up (1: No, 2: Yes)
head(glow_bonemed)
## sub_id site_id phy_id priorfrac age weight height bmi premeno momfrac
## 1 1 1 14 No 62 70.3 158 28.16055 No No
## 2 2 4 284 No 65 87.1 160 34.02344 No No
## 3 3 6 305 Yes 88 50.8 157 20.60936 No Yes
## 4 4 6 309 No 82 62.1 160 24.25781 No No
## 5 5 1 37 No 61 68.0 152 29.43213 No No
## 6 6 5 299 Yes 67 68.0 161 26.23356 No No
## armassist smoke raterisk fracscore fracture bonemed bonemed_fu bonetreat
## 1 No No Same 1 No No No No
## 2 No No Same 2 No No No No
## 3 Yes No Less 11 No No No No
## 4 No No Less 5 No No No No
## 5 No No Same 1 No No No No
## 6 No Yes Same 4 No No No No
Summary statistics for numeric variables
#Set fracscore to integer for summary statistics
glow_bonemed$fracscore = as.integer(glow_bonemed$fracscore)
mysummary = glow_bonemed %>%
select(age, weight, height, bmi, fracscore) %>%
summarise_each(
funs(min = min,
q25 = quantile(., 0.25),
median = median,
q75 = quantile(., 0.75),
max = max,
mean = mean,
sd = sd,
variance= var))
# reshape it using tidyr functions
clean.summary = mysummary %>%
gather(stat, val) %>%
separate(stat, into = c("var", "stat"), sep = "_") %>%
spread(stat, val) %>%
select(var, min, max, mean, sd, variance)
print(clean.summary)
## var min max mean sd variance
## 1 age 55.00000 90.00000 68.56200 8.989537 80.811780
## 2 bmi 14.87637 49.08241 27.55303 5.973958 35.688178
## 3 fracscore 0.00000 11.00000 3.69800 2.495446 6.227251
## 4 height 134.00000 199.00000 161.36400 6.355493 40.392289
## 5 weight 39.90000 127.00000 71.82320 16.435992 270.141825
Summary statistics for categorical variables
summary(glow_bonemed %>% select(priorfrac, premeno, momfrac, armassist, smoke, raterisk, fracture, bonemed, bonemed_fu, bonetreat))
## priorfrac premeno momfrac armassist smoke raterisk fracture
## No :374 No :403 No :435 No :312 No :465 Less :167 No :375
## Yes:126 Yes: 97 Yes: 65 Yes:188 Yes: 35 Same :186 Yes:125
## Greater:147
## bonemed bonemed_fu bonetreat
## No :371 No :361 No :382
## Yes:129 Yes:139 Yes:118
##
No missing values
colSums(is.na(glow_bonemed))
## sub_id site_id phy_id priorfrac age weight height
## 0 0 0 0 0 0 0
## bmi premeno momfrac armassist smoke raterisk fracscore
## 0 0 0 0 0 0 0
## fracture bonemed bonemed_fu bonetreat
## 0 0 0 0
sum(is.na(glow_bonemed))
## [1] 0
Age vs Weight: As weight increases the average age decreases
Age
vs Height: Weak correlation of as height increases age decreases
Age
vs BMI: As bmi increases the average age decreases
Age vs fracscore:
As age increases the average fracscore increases
Weight vs Height: As height increases the average weight
increases
Weight vs BMI: As bmi increases the average weight
increases
Weight vs fracscore: As fracscore increases the average
Weight decreases
Height vs BMI: As bmi increases the average height and variance stay
the same
Height vs fracscore: As fracscore increases the average
height stays the same though variance might decrease
BMI vs fracscore: As fracscore increases the average bmi
decreases
plot(glow_bonemed[, c(5:8, 14)])
Non of the following scatter plots show strong groupings for the fracture/no fracture categorical variable
ggplot(glow_bonemed, aes(x = age, y = bmi, color = fracture)) +
geom_jitter() +
labs(title = "BMI vs Age")
ggplot(glow_bonemed, aes(x = bmi, y = fracscore, color = fracture)) +
geom_jitter() +
labs(title = "Fracture Score vs BMI")
ggplot(glow_bonemed, aes(x = fracscore, y = age, color = fracture)) +
geom_jitter() +
labs(title = "Age vs Fracture Score")
ggplot(glow_bonemed, aes(x = weight, y = height, color = fracture)) +
geom_jitter() +
labs(title = "Height vs Weight")
Once again there doesn’t seem to be strong groupings of the fracture categorical variable
fracture3dplot = plot_ly(glow_bonemed,
x = ~age,
y = ~height,
z = ~bmi,
color = ~fracture,
colors = c('#0C4B8E', '#BF382A')) %>% add_markers()
fracture3dplot
There are so little “yes” fracture results that the plot isn’t very useful
## `geom_smooth()` using formula = 'y ~ x'
The boxplot shows that the mean fracscore seems to be slightly higher for smokers compared to non smokers
ggplot(glow_bonemed, aes(x = smoke, y = fracscore)) +
geom_boxplot() +
labs(title = "Fracture Score Summary Statistics for Smokers vs Non Smokers")
Plot confirms there is a strong correlation between age/fracscore, bmi/weight
corr_vars <- c("age", "weight", "height", "bmi", "fracscore")
corr_df <- glow_bonemed[, corr_vars]
corr_df <- cor(corr_df)
ggcorrplot(corr = corr_df, lab = TRUE, lab_size = 2,
colors = c("#6D9EC1", "white", "#E46726")) +
labs(title = "Correlation Between Variables") +
theme(plot.title = element_text(hjust = .5),
plot.subtitle = element_text(hjust = .5))
mosaicplot(table(glow_bonemed$priorfrac, glow_bonemed$fracture), color = TRUE)
chisq.test(table(glow_bonemed$priorfrac, glow_bonemed$fracture))
##
## Pearson's Chi-squared test with Yates' continuity correction
##
## data: table(glow_bonemed$priorfrac, glow_bonemed$fracture)
## X-squared = 22.635, df = 1, p-value = 1.959e-06
Perform chi-squared test on all categorical variables
glow_bonemed$fracscore = as.factor(glow_bonemed$fracscore)
multipleCorrespondenceAnalysis = MCA(glow_bonemed[, c(4, 9:18)], graph = FALSE)
multipleCorrespondenceAnalysis
## **Results of the Multiple Correspondence Analysis (MCA)**
## The analysis was performed on 500 individuals, described by 11 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. of the categories"
## 4 "$var$cos2" "cos2 for the categories"
## 5 "$var$contrib" "contributions of the categories"
## 6 "$var$v.test" "v-test for the categories"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "intermediate results"
## 12 "$call$marge.col" "weights of columns"
## 13 "$call$marge.li" "weights of rows"
plot.MCA(multipleCorrespondenceAnalysis,
cex = 0.8,
col.quali.sup = c("red", "blue", "green"),
title = "Multiple Correspondence Analysis")
#glow_bonemed[, c(4, 9:18)]
#chisq.test(categoricalVariablesTable)
pheatmap(glow_bonemed[, c(5,8)], scale = "column", fontsize_row = 0.1, cluster_cols = F, legend = T, color = colorRampPalette(c("blue", "white", "red"), space = "rgb")(100))
pheatmap(glow_bonemed[, 5:8], scale = "column", fontsize_row = 0.1, cluster_cols = F, legend = T, color = colorRampPalette(c("blue", "white", "red"), space = "rgb")(100))
zScoreScale = scale(glow_bonemed[, 5:8])
zScoreDistance = dist(zScoreScale)
continuousVariableClustering = hclust(zScoreDistance, method = "complete")
plot(continuousVariableClustering)
Split the data into a training/testing set
set.seed(4)
trainingIndices = sample(c(1:dim(glow_bonemed)[1]), dim(glow_bonemed)[1]*0.8)
trainingDataframe = glow_bonemed[trainingIndices,]
testingDataframe = glow_bonemed[-trainingIndices,]
Age is the only statistically significant continuous variable at the alpha = 0.2 level (p < 0.0001)
model = glm(fracture ~ age + weight + height + bmi, data = glow_bonemed[trainingIndices,], family = "binomial")
summary(model)
##
## Call:
## glm(formula = fracture ~ age + weight + height + bmi, family = "binomial",
## data = glow_bonemed[trainingIndices, ])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2770 -0.7951 -0.6672 1.2602 1.9710
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.006590 13.891104 -0.432 0.665447
## age 0.045888 0.013481 3.404 0.000664 ***
## weight -0.039155 0.095610 -0.410 0.682156
## height 0.008938 0.085578 0.104 0.916814
## bmi 0.113632 0.247382 0.459 0.645993
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 458.45 on 399 degrees of freedom
## Residual deviance: 442.53 on 395 degrees of freedom
## AIC: 452.53
##
## Number of Fisher Scoring iterations: 4
AIC(model)
## [1] 452.5326
# trying to figure out how to use sjPlot to mimic what we did in unit12 prelive
#plot_model(model, type = "pred", terms = c("age", "smoke"))
Get eigen vectors and values for principle component analysis
glow_bonemed$fracscore = as.integer(glow_bonemed$fracscore)
corr_vars <- c("age", "weight", "height", "bmi", "fracscore")
pc.result<-prcomp(glow_bonemed[, corr_vars],scale.=TRUE)
#Eigen Vectors
pc.result$rotation
## PC1 PC2 PC3 PC4 PC5
## age 0.4947219 0.46742140 -0.15246583 0.71654567 -0.009160237
## weight -0.5273035 0.46578775 -0.08840991 0.03240244 -0.704362523
## height -0.2345770 -0.08196149 -0.93823245 0.01885633 0.240042129
## bmi -0.4741030 0.51615173 0.24533677 0.05137380 0.667820563
## fracscore 0.4442985 0.53984137 -0.16872342 -0.69463484 0.013601399
#Eigen Values
eigenvals<-pc.result$sdev^2
eigenvals
## [1] 2.374116591 1.523507236 0.975710317 0.123469514 0.003196342
Scree plot
par(mfrow = c(1, 2))
plot(eigenvals,type = "l",
main = "Scree Plot",
ylab = "Eigen Values",
xlab = "PC #")
plot(eigenvals / sum(eigenvals),
type = "l", main = "Scree Plot",
ylab = "Prop. Var. Explained",
xlab = "PC #",
ylim = c(0, 1))
cumulative.prop = cumsum(eigenvals / sum(eigenvals))
lines(cumulative.prop, lty = 2)
Naive Bayes model with categorical variables
corr_vars <- c("priorfrac", "premeno", "momfrac", "armassist", "smoke", "raterisk", "fracture", "bonemed", "bonemed_fu", "bonetreat")
set.seed(1234)
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 1)
nb.fit = train(fracture ~ .,
data = trainingDataframe[, corr_vars],
method = "nb",
trControl = fitControl
)
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 16
nb.fit
## Naive Bayes
##
## 400 samples
## 9 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 359, 361, 360, 360, 359, 361, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE 0.6731191 0.163332
## TRUE 0.7400891 0.000000
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = TRUE and adjust
## = 1.
Knn models
corr_vars <- c("age", "weight", "height", "bmi", "fracscore", "fracture")
set.seed(1234)
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 1)
knn.fit = train(fracture ~ .,
data = trainingDataframe[, corr_vars],
method = "knn",
trControl = fitControl,
tuneGrid = expand.grid(k = c(1:10, 20, 30))
)
knn.fit
## k-Nearest Neighbors
##
## 400 samples
## 5 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 359, 361, 360, 360, 359, 361, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.6793168 0.15146923
## 2 0.6643074 0.09772090
## 3 0.6869418 0.07266771
## 4 0.6624969 0.01579128
## 5 0.6802533 -0.01485883
## 6 0.6999484 0.02039872
## 7 0.7047624 -0.01348597
## 8 0.7000094 -0.02251761
## 9 0.7149515 -0.01036712
## 10 0.7298358 0.07106741
## 20 0.7350829 0.02719129
## 30 0.7400891 0.00000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 30.
After tinkering it seems the best model only uses the age variable
corr_vars <- c("age", "weight", "height", "bmi", "fracscore", "fracture")
set.seed(1234)
fitControl = trainControl(method = "repeatedcv", number = 10, repeats = 1)
knn.fit = train(fracture ~ age,
data = trainingDataframe[, corr_vars],
method = "knn",
trControl = fitControl,
tuneGrid = expand.grid(k = c(1:10, 20, 30))
)
knn.fit
## k-Nearest Neighbors
##
## 400 samples
## 1 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 1 times)
## Summary of sample sizes: 359, 361, 360, 360, 359, 361, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 1 0.7178299 0.04451770
## 2 0.7252048 0.04095402
## 3 0.7302720 0.05828196
## 4 0.7202689 0.03291690
## 5 0.7327720 0.05605753
## 6 0.7328971 0.06388898
## 7 0.7454644 0.08867516
## 8 0.7428361 0.07440775
## 9 0.7454644 0.07173952
## 10 0.7276438 -0.01393495
## 20 0.7299578 0.01388032
## 30 0.7302079 -0.00185982
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
#Sources:
Hosmer, D.W., Lemeshow, S. and Sturdivant, R.X. (2013)
Applied Logistic Regression, 3rd ed., New York: Wiley
https://cran.r-project.org/web/packages/aplore3/aplore3.pdf#page=11&zoom=100,132,90